1 Setup

#libraries 
suppressPackageStartupMessages({
  library(here)
  library(ggplot2)
  library(purrr)
  library(tidyverse)
  library(UpSetR)
})

#global objects used later in functions
#DE files
files <- list.files(here("data/analysis/DE"), pattern = "genes\\.csv$", full.names = TRUE)

#cell wall genes
cw <- read_tsv(here("data/analysis/cell-wall/goi-vst-expression_with-geneID.tsv"))

#sets (comparisons)
sets <- c("120h-vs-72h", "72h-vs-24h", "24h-vs-12h", "12h-vs-4h", "4h-vs-1h", "1h-vs-std")

2 Preparations

Blow are the functions used for data preparations and plotting.

  • For Volcano plots we use padj < 0.001 and abs(log2FoldChange) >=1 as the significance level.
  • For Upset plots we start by reading the DE genes and prepare a binary matrix for overlapping genes in various comparisons.
#functions

# function to plot the volcano plot
fun.volcano <- function(comparison, padj=0.001){
  file <- grep(pattern = comparison, files, value = TRUE)
  data <- read.csv(file)
  data$significance <- ifelse(data$padj < padj & abs(data$log2FoldChange) >= 1,
                              ifelse(data$log2FoldChange >= 1, "up", "down"),
                              "not significant")
  x_down <- min(data$log2FoldChange) %>% round()
  x_up <- max(data$log2FoldChange) %>% round()
  y <- -log10(min(data$padj)) %>% round()
  
  # Create a data frame for the text labels
  text_data <- data.frame(
    x = c(x_down, x_up, 0),
    y = c(y, y, y),
    label = c(paste(sum(data$significance == "down"), "\n genes"),
              paste(sum(data$significance == "up"), "\n genes"),
              paste("padj <", padj))
  )
  ggplot(data, aes(x = log2FoldChange, y = -log10(padj))) +
    geom_point(aes(color = significance), alpha = 0.5) +
    scale_color_manual(values = c("up" = "red", "down" = "blue", "not significant" = "grey")) +
    geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey") +
    geom_hline(yintercept = -log10(0.001), linetype = "dashed", color = "grey") +
    geom_text(data = text_data, aes(x = x, y = y, label = label)) +
    xlab("Log2 Fold Change") +
    ylab("-Log10 Adjusted P-value")
}

#function to calculate the frequency matrix for upset plot
fun.mat <- function(condition){
  #list of dataframes
  df_list <- lapply(files, function(file) {
    data <- read.csv(file)
    if (!is.null(condition)) {
      rows <- eval(parse(text = condition), envir = data)
      gene_names <- data[rows, 1]
    } else {
      gene_names <- data[,1]
    }
    df <- data.frame(ID = gene_names, comparison = 1)
    colnames(df) <- c("ID", gsub(".*Time-(.*)genes\\.csv", "\\1", file))
    return(df)
  })
  names(df_list) <- gsub(".*Time-(.*)genes\\.csv", "\\1", files)
  
  mat <- reduce(df_list, full_join, by = "ID")
  mat[is.na(mat)] <- 0
  return(mat)
}

#upset plot function
fun.upset <- function(mat = mat, col="black", meta = FALSE) {
  if(!isTRUE(meta)){
    metadata <- NULL
  } else {
    cw_genes <- sapply(sets , function(x) {
      file <- grep(pattern = x, files, value = TRUE)
      genes <- read.csv(file) %>% .$X
      cw_genes <- intersect(cw$ID, intersect(genes, mat$ID))
      length(cw_genes)
    })
    metadata.cw <- data.frame(sets = sets, cw_genes = cw_genes)
    metadata <- list(data = metadata.cw, plots = list(list(type = "hist", column = "cw_genes", assign = 20, fontsize = 10)))
  }
  upset(mat, 
        order.by = c("freq","degree"), 
        sets = sets[-1],
        keep.order = TRUE, 
        mainbar.y.label = "Number of DE genes (exclusive)", 
        sets.x.label = "Number of DE genes", 
        point.size = 7, 
        shade.color = "grey50", 
        main.bar.color = col,
        nintersects = 20, 
        text.scale = c(1.8,1.4,1.5,1.4,1.5,1.3),
        set.metadata = metadata) 
}

3 Volcano plots

1h-vs-std
fun.volcano(comparison = "1h-vs-std")

4h-vs-1h
fun.volcano(comparison = "4h-vs-1h")

12h-vs-4h
fun.volcano(comparison = "12h-vs-4h")

24h-vs-12
fun.volcano(comparison = "24h-vs-12")

72h-vs-24h
fun.volcano(comparison = "72h-vs-24h")

120h-vs-72h
fun.volcano(comparison = "120h-vs-72h")

4 intersections (upset plots)

First we look at all intersections

mat <- fun.mat(condition = NULL)

upset(mat, 
      sets = sets,
      keep.order = TRUE, 
      mainbar.y.label = "Number of DE genes (exclusive)", 
      sets.x.label = "Number of DE genes", 
      shade.color = "grey50", 
      text.scale = c(1.8,1.4,1.5,1.4,1.5,1.3))

We might want to drop the comparisons between 120h and 72h, order the intersections and limit their numbers to the top 20 ones.

⚠️For easier evaluations, we first order the bars by frequency and then by degree. This means that we will have first the bars for individual comparisons (ordered by frequency) and then overlapping of two comparisons followed by the group of three comparisons.

Below, we examine these selected comparisons for up-regulated and down-regulated genes, both collectively and separately.

All genes
fun.upset(mat = mat)

Up regulated genes
mat_up <- fun.mat(condition = "data$log2FoldChange >= 1")
fun.upset(mat = mat_up, col = "red")

Down regulated genes
mat_dn <- fun.mat(condition = "data$log2FoldChange <= 1")
fun.upset(mat = mat_dn, col = "blue")

4.1 Cell wall integration

It would be interesting to see the relationship to the cell wall related genes

All DE genes
fun.upset(mat = mat, meta = TRUE)

Up regulated DE genes
fun.upset(mat = mat_up, meta = TRUE, col = "red")

Down regulated DE genes
fun.upset(mat = mat_dn, meta = TRUE, col = "blue")

5 Session Info

Session Info
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0    lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
##  [5] dplyr_1.1.2     readr_2.1.4     tidyr_1.3.0     tibble_3.2.1   
##  [9] tidyverse_2.0.0 purrr_1.0.1     ggplot2_3.4.2   here_1.0.1     
## 
## loaded via a namespace (and not attached):
##  [1] emoji_15.0        sass_0.4.7        utf8_1.2.3        generics_0.1.3   
##  [5] stringi_1.7.12    hms_1.1.3         digest_0.6.33     magrittr_2.0.3   
##  [9] evaluate_0.21     grid_4.3.1        timechange_0.2.0  fastmap_1.1.1    
## [13] rprojroot_2.0.3   plyr_1.8.8        jsonlite_1.8.7    gridExtra_2.3    
## [17] fansi_1.0.4       scales_1.2.1      jquerylib_0.1.4   cli_3.6.1        
## [21] rlang_1.1.1       crayon_1.5.2      bit64_4.0.5       munsell_0.5.0    
## [25] withr_2.5.0       cachem_1.0.8      yaml_2.3.7        parallel_4.3.1   
## [29] tools_4.3.1       tzdb_0.4.0        colorspace_2.1-0  vctrs_0.6.3      
## [33] R6_2.5.1          lifecycle_1.0.3   bit_4.0.5         vroom_1.6.3      
## [37] pkgconfig_2.0.3   pillar_1.9.0      bslib_0.5.0       gtable_0.3.3     
## [41] glue_1.6.2        Rcpp_1.0.11       highr_0.10        xfun_0.39        
## [45] tidyselect_1.2.0  rstudioapi_0.15.0 knitr_1.43        farver_2.1.1     
## [49] htmltools_0.5.5   labeling_0.4.2    rmarkdown_2.23    compiler_4.3.1

 

 

drawing

Created by Aman Zare

aman.zare@umu.se